// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.

// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#ifndef _H_MMM_12X8
#define _H_MMM_12X8

// Int8 matrix-multiply micro-kernel: accumulates a 12x8 tile of 32-bit results into C.
//
// The kernel is a single AArch64 inline-asm statement built around SMMLA, emitted as raw
// `.inst` words with the mnemonic kept in a trailing assembler comment. Per the Arm ISA,
// one SMMLA multiplies a 2x8 block of signed 8-bit values (first source) by an 8x2 block
// (second source) and adds the 2x2 result to a vector of four 32-bit lanes.
//
// Per K-step the kernel consumes six 16-byte vectors of A (0x60 bytes, two tile rows per
// vector) and four 16-byte vectors of B (0x40 bytes, two tile columns per vector) and
// issues 24 SMMLAs into the accumulators v0-v23. After the last step the 2x2 blocks are
// rearranged into rows of eight 32-bit values and each row is added to the values already
// stored in C (read-modify-write).
//
// Parameters:
//   offset  Distance in bytes between consecutive rows of the C tile; it is added directly
//           to the row address after each of the first 11 rows.
//   K4      Number of K-steps to run (the loop counter). A value of 0 performs no
//           multiplication, so C is rewritten with its own contents.
//           NOTE(review): the name suggests K/4, but each counted step consumes 0x60 bytes
//           of A and 0x40 bytes of B - confirm what the caller passes.
//   A       Start of the A operand; read sequentially, 0x60 bytes per step.
//           Presumably pre-packed by the caller into the SMMLA-friendly layout - verify.
//   B       Start of the B operand; read sequentially, 0x40 bytes per step. Same packing
//           assumption as A.
//   C       Address of row 0 of the output tile; 12 rows of 8 x 32-bit values are updated.
//
// NOTE(review): the main loop always preloads the operands of the following step, so when
// K4 is even the last pass reads 0x20 bytes of A and 0x40 bytes of B beyond the data that
// is actually consumed (the values are discarded). Likewise the initial loads run even
// when K4 is 0. Presumably the caller's buffers are padded - confirm.
// NOTE(review): this is a non-static, non-inline definition inside a header; including it
// from more than one translation unit would define the symbol twice - confirm usage.
void mmm_12x8(U64 offset, U64 K4, INT8 *A, INT8 *B, I32 *C)
{
    // Register roles inside the asm:
    //   x20 = A cursor, x21 = B cursor, x22 = remaining K-steps, x26 = current C row address
    //   v0-v23  = accumulators (v0-v3: A vector 0, v4-v7: A vector 1, ... v20-v23: A vector 5;
    //             within each group the four registers pair with B vectors 0..3)
    //   v24-v27 = A vectors in flight, v28-v31 = B vectors in flight
    // Preload A vectors 0-1 and B vectors 0-3 of the first step, then copy the operands
    // into the working registers.
    __asm__ __volatile__("ld1 {v24.16b, v25.16b}, [%[A]]\n"
                         "ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [%[B]]\n"
                         "mov x20, %[A]\n"
                         "mov x21, %[B]\n"
                         "mov x26, %[C]\n"
                         "mov x22, %[K]\n"

                         // Clear all 24 accumulators.
                         "movi v0.16b, #0x0\n"
                         "movi v1.16b, #0x0\n"
                         "movi v2.16b, #0x0\n"
                         "movi v3.16b, #0x0\n"
                         "movi v4.16b, #0x0\n"
                         "movi v5.16b, #0x0\n"
                         "movi v6.16b, #0x0\n"
                         "movi v7.16b, #0x0\n"
                         "movi v8.16b, #0x0\n"
                         "movi v9.16b, #0x0\n"
                         "movi v10.16b, #0x0\n"
                         "movi v11.16b, #0x0\n"
                         "movi v12.16b, #0x0\n"
                         "movi v13.16b, #0x0\n"
                         "movi v14.16b, #0x0\n"
                         "movi v15.16b, #0x0\n"
                         "movi v16.16b, #0x0\n"
                         "movi v17.16b, #0x0\n"
                         "movi v18.16b, #0x0\n"
                         "movi v19.16b, #0x0\n"
                         "movi v20.16b, #0x0\n"
                         "movi v21.16b, #0x0\n"
                         "movi v22.16b, #0x0\n"
                         "movi v23.16b, #0x0\n"

                         // With at most one step to do, bypass the two-step loop. The flags
                         // set by this compare are tested again at label 1.
                         // NOTE(review): ble/bgt are signed conditions applied to a U64 count.
                         "cmp x22, #1\n"
                         "ble 1f\n"

                         // Main loop: two K-steps per pass. Loads of upcoming vectors are
                         // interleaved with the SMMLAs that still use the current register
                         // contents, so the instruction order matters.
                         // First step of the pass: A vectors live in v24, v25, v26, v27, v24, v25
                         // (offsets 0x00..0x50 from x20); B vectors in v28..v31.
                         "0:\n"
                         ".inst 0x4e9ca700  // smmla v0.4s, v24.16b, v28.16b\n"
                         ".inst 0x4e9da701  // smmla v1.4s, v24.16b, v29.16b\n"
                         ".inst 0x4e9ca724  // smmla v4.4s, v25.16b, v28.16b\n"
                         ".inst 0x4e9da725  // smmla v5.4s, v25.16b, v29.16b\n"
                         "ldr q26, [x20, 0x20]\n"
                         "ldr q27, [x20, 0x30]\n"
                         ".inst 0x4e9ea702  // smmla v2.4s, v24.16b, v30.16b\n"
                         ".inst 0x4e9fa703  // smmla v3.4s, v24.16b, v31.16b\n"
                         "ldr q24, [x20, 0x40]\n"
                         ".inst 0x4e9ea726  // smmla v6.4s, v25.16b, v30.16b\n"
                         ".inst 0x4e9fa727  // smmla v7.4s, v25.16b, v31.16b\n"
                         ".inst 0x4e9ca748  // smmla v8.4s, v26.16b, v28.16b\n"
                         "ldr q25, [x20, 0x50]\n"
                         ".inst 0x4e9da749  // smmla v9.4s, v26.16b, v29.16b\n"
                         ".inst 0x4e9ea74a  // smmla v10.4s, v26.16b, v30.16b\n"
                         ".inst 0x4e9fa74b  // smmla v11.4s, v26.16b, v31.16b\n"
                         "ldr q26, [x20, 0x60]\n"
                         ".inst 0x4e9ca76c  // smmla v12.4s, v27.16b, v28.16b\n"
                         ".inst 0x4e9ca710  // smmla v16.4s, v24.16b, v28.16b\n"
                         ".inst 0x4e9ca734  // smmla v20.4s, v25.16b, v28.16b\n"
                         "ldr q28, [x21, 0x40]\n"
                         ".inst 0x4e9da76d  // smmla v13.4s, v27.16b, v29.16b\n"
                         ".inst 0x4e9ea76e  // smmla v14.4s, v27.16b, v30.16b\n"
                         ".inst 0x4e9fa76f  // smmla v15.4s, v27.16b, v31.16b\n"
                         "ldr q27, [x20, 0x70]\n"
                         ".inst 0x4e9da711  // smmla v17.4s, v24.16b, v29.16b\n"
                         ".inst 0x4e9da735  // smmla v21.4s, v25.16b, v29.16b\n"
                         "ldr q29, [x21, 0x50]\n"
                         ".inst 0x4e9ea712  // smmla v18.4s, v24.16b, v30.16b\n"
                         ".inst 0x4e9ea736  // smmla v22.4s, v25.16b, v30.16b\n"
                         "ldr q30, [x21, 0x60]\n"
                         ".inst 0x4e9fa713  // smmla v19.4s, v24.16b, v31.16b\n"
                         ".inst 0x4e9fa737  // smmla v23.4s, v25.16b, v31.16b\n"
                         "ldr q24, [x20, 0x80]\n"
                         "sub x22, x22, #2\n"

                         // Second step of the pass: A vectors rotate through v26, v27, v24,
                         // v25, v26, v27 (offsets 0x60..0xb0). The loads at 0xc0/0xd0 (A) and
                         // 0x80..0xb0 (B) fetch the operands of the next pass.
                         ".inst 0x4e9ca740  // smmla v0.4s, v26.16b, v28.16b\n"
                         ".inst 0x4e9ca764  // smmla v4.4s, v27.16b, v28.16b\n"
                         "ldr q31, [x21, 0x70]\n"
                         ".inst 0x4e9da741  // smmla v1.4s, v26.16b, v29.16b\n"
                         ".inst 0x4e9da765  // smmla v5.4s, v27.16b, v29.16b\n"
                         "ldr q25, [x20, 0x90]\n"
                         ".inst 0x4e9ea742  // smmla v2.4s, v26.16b, v30.16b\n"
                         ".inst 0x4e9ea766  // smmla v6.4s, v27.16b, v30.16b\n"
                         ".inst 0x4e9fa743  // smmla v3.4s, v26.16b, v31.16b\n"
                         "ldr q26, [x20, 0xa0]\n"
                         ".inst 0x4e9fa767  // smmla v7.4s, v27.16b, v31.16b\n"
                         ".inst 0x4e9ca708  // smmla v8.4s, v24.16b, v28.16b\n"
                         "ldr q27, [x20, 0xb0]\n"
                         ".inst 0x4e9da709  // smmla v9.4s, v24.16b, v29.16b\n"
                         ".inst 0x4e9ea70a  // smmla v10.4s, v24.16b, v30.16b\n"
                         ".inst 0x4e9fa70b  // smmla v11.4s, v24.16b, v31.16b\n"
                         "ldr q24, [x20, 0xc0]\n"
                         ".inst 0x4e9ca72c  // smmla v12.4s, v25.16b, v28.16b\n"
                         ".inst 0x4e9ca750  // smmla v16.4s, v26.16b, v28.16b\n"
                         ".inst 0x4e9ca774  // smmla v20.4s, v27.16b, v28.16b\n"
                         "ldr q28, [x21, 0x80]\n"
                         ".inst 0x4e9da72d  // smmla v13.4s, v25.16b, v29.16b\n"
                         ".inst 0x4e9ea72e  // smmla v14.4s, v25.16b, v30.16b\n"
                         ".inst 0x4e9fa72f  // smmla v15.4s, v25.16b, v31.16b\n"
                         "ldr q25, [x20, 0xd0]\n"
                         ".inst 0x4e9da751  // smmla v17.4s, v26.16b, v29.16b\n"
                         ".inst 0x4e9da775  // smmla v21.4s, v27.16b, v29.16b\n"
                         "ldr q29, [x21, 0x90]\n"
                         ".inst 0x4e9ea752  // smmla v18.4s, v26.16b, v30.16b\n"
                         ".inst 0x4e9ea776  // smmla v22.4s, v27.16b, v30.16b\n"
                         "ldr q30, [x21, 0xa0]\n"
                         ".inst 0x4e9fa753  // smmla v19.4s, v26.16b, v31.16b\n"
                         ".inst 0x4e9fa777  // smmla v23.4s, v27.16b, v31.16b\n"
                         "ldr q31, [x21, 0xb0]\n"
                         // Advance both cursors by two steps (2 * 0x60 for A, 2 * 0x40 for B)
                         // and repeat while more than one step remains.
                         "add x20, x20, 0xc0\n"
                         "add x21, x21, 0x80\n"
                         "cmp x22, #1\n"
                         "bgt 0b\n"
                         // Flags here come from the most recent `cmp x22, #1` (nothing in
                         // between alters them): "equal" means exactly one step is pending,
                         // anything else goes straight to the write-back.
                         "1:\n"
                         "bne 2f\n"
                         // Tail: one final step. Same schedule as the first half of the loop,
                         // using the already preloaded v24/v25/v28-v31, but without fetching
                         // operands for a following step.
                         ".inst 0x4e9ca700  // smmla v0.4s, v24.16b, v28.16b\n"
                         ".inst 0x4e9da701  // smmla v1.4s, v24.16b, v29.16b\n"
                         ".inst 0x4e9ca724  // smmla v4.4s, v25.16b, v28.16b\n"
                         ".inst 0x4e9da725  // smmla v5.4s, v25.16b, v29.16b\n"
                         "ldr q26, [x20, 0x20]\n"
                         "ldr q27, [x20, 0x30]\n"
                         ".inst 0x4e9ea702  // smmla v2.4s, v24.16b, v30.16b\n"
                         ".inst 0x4e9fa703  // smmla v3.4s, v24.16b, v31.16b\n"
                         "ldr q24, [x20, 0x40]\n"
                         ".inst 0x4e9ea726  // smmla v6.4s, v25.16b, v30.16b\n"
                         ".inst 0x4e9fa727  // smmla v7.4s, v25.16b, v31.16b\n"
                         ".inst 0x4e9ca748  // smmla v8.4s, v26.16b, v28.16b\n"
                         "ldr q25, [x20, 0x50]\n"
                         ".inst 0x4e9da749  // smmla v9.4s, v26.16b, v29.16b\n"
                         ".inst 0x4e9ea74a  // smmla v10.4s, v26.16b, v30.16b\n"
                         ".inst 0x4e9fa74b  // smmla v11.4s, v26.16b, v31.16b\n"
                         ".inst 0x4e9ca76c  // smmla v12.4s, v27.16b, v28.16b\n"
                         ".inst 0x4e9ca710  // smmla v16.4s, v24.16b, v28.16b\n"
                         ".inst 0x4e9ca734  // smmla v20.4s, v25.16b, v28.16b\n"
                         ".inst 0x4e9da76d  // smmla v13.4s, v27.16b, v29.16b\n"
                         ".inst 0x4e9ea76e  // smmla v14.4s, v27.16b, v30.16b\n"
                         ".inst 0x4e9fa76f  // smmla v15.4s, v27.16b, v31.16b\n"
                         ".inst 0x4e9da711  // smmla v17.4s, v24.16b, v29.16b\n"
                         ".inst 0x4e9da735  // smmla v21.4s, v25.16b, v29.16b\n"
                         ".inst 0x4e9ea712  // smmla v18.4s, v24.16b, v30.16b\n"
                         ".inst 0x4e9ea736  // smmla v22.4s, v25.16b, v30.16b\n"
                         ".inst 0x4e9fa713  // smmla v19.4s, v24.16b, v31.16b\n"
                         ".inst 0x4e9fa737  // smmla v23.4s, v25.16b, v31.16b\n"

                         // Write-back. Each accumulator holds a 2x2 block, so uzp1 on 64-bit
                         // lanes gathers the upper row of two neighbouring blocks (4 columns)
                         // and uzp2 gathers the lower row. Every output row is two vectors:
                         // columns 0-3 and columns 4-7. Each row is loaded from C, summed with
                         // the computed row and stored back; x26 then moves on by `offset`.
                         // Accumulators that have already been consumed are reused as
                         // destinations for later rows, so the ordering below is deliberate.
                         "2:\n"
                         // Rows 0-3: v24/v25 = row 0, v26/v27 = row 1, v28/v29 = row 2,
                         // v30/v31 = row 3 (v30/v31 first carry the old C row 0).
                         "ld1 {v30.4s, v31.4s}, [x26]\n"
                         "uzp1 v24.2d, v0.2d, v1.2d\n"
                         "uzp1 v25.2d, v2.2d, v3.2d\n"
                         "uzp2 v26.2d, v0.2d, v1.2d\n"
                         "uzp2 v27.2d, v2.2d, v3.2d\n"
                         "uzp1 v28.2d, v4.2d, v5.2d\n"
                         "add v24.4s, v24.4s, v30.4s\n"
                         "add v25.4s, v25.4s, v31.4s\n"
                         "uzp2 v30.2d, v4.2d, v5.2d\n"
                         "uzp1 v29.2d, v6.2d, v7.2d\n"
                         "uzp2 v31.2d, v6.2d, v7.2d\n"
                         "st1 {v24.4s, v25.4s}, [x26]\n"
                         "add x26, x26, %[offset]\n"
                         // From here on v24/v25 hold the old contents of the current C row.
                         // Rows 4-7 are staged into v0-v7 while row 1 is written.
                         "ld1 {v24.4s, v25.4s}, [x26]\n"
                         "uzp1 v0.2d, v8.2d, v9.2d\n"
                         "uzp2 v2.2d, v8.2d, v9.2d\n"
                         "uzp1 v1.2d, v10.2d, v11.2d\n"
                         "uzp2 v3.2d, v10.2d, v11.2d\n"
                         "add v26.4s, v26.4s, v24.4s\n"
                         "add v27.4s, v27.4s, v25.4s\n"
                         "uzp1 v4.2d, v12.2d, v13.2d\n"
                         "uzp2 v6.2d, v12.2d, v13.2d\n"
                         "uzp1 v5.2d, v14.2d, v15.2d\n"
                         "uzp2 v7.2d, v14.2d, v15.2d\n"
                         "st1 {v26.4s, v27.4s}, [x26]\n"
                         "add x26, x26, %[offset]\n"
                         // Rows 8-11 are staged into v8-v15 while rows 2 and 3 are written.
                         "ld1 {v24.4s, v25.4s}, [x26]\n"
                         "uzp1 v8.2d, v16.2d, v17.2d\n"
                         "uzp2 v10.2d, v16.2d, v17.2d\n"
                         "uzp1 v9.2d, v18.2d, v19.2d\n"
                         "uzp2 v11.2d, v18.2d, v19.2d\n"
                         "add v28.4s, v28.4s, v24.4s\n"
                         "add v29.4s, v29.4s, v25.4s\n"
                         "uzp1 v12.2d, v20.2d, v21.2d\n"
                         "uzp2 v14.2d, v20.2d, v21.2d\n"
                         "st1 {v28.4s, v29.4s}, [x26]\n"
                         "add x26, x26, %[offset]\n"
                         "ld1 {v24.4s, v25.4s}, [x26]\n"
                         "uzp1 v13.2d, v22.2d, v23.2d\n"
                         "uzp2 v15.2d, v22.2d, v23.2d\n"
                         "add v30.4s, v30.4s, v24.4s\n"
                         "add v31.4s, v31.4s, v25.4s\n"
                         "st1 {v30.4s, v31.4s}, [x26]\n"
                         "add x26, x26, %[offset]\n"

                         // Row 4
                         "ld1 {v24.4s, v25.4s}, [x26]\n"
                         "add v0.4s, v0.4s, v24.4s\n"
                         "add v1.4s, v1.4s, v25.4s\n"
                         "st1 { v0.4s,  v1.4s}, [x26]\n"
                         "add x26, x26, %[offset]\n"

                         // Row 5
                         "ld1 {v24.4s, v25.4s}, [x26]\n"
                         "add v2.4s, v2.4s, v24.4s\n"
                         "add v3.4s, v3.4s, v25.4s\n"
                         "st1 { v2.4s,  v3.4s}, [x26]\n"
                         "add x26, x26, %[offset]\n"

                         // Row 6
                         "ld1 {v24.4s, v25.4s}, [x26]\n"
                         "add v4.4s, v4.4s, v24.4s\n"
                         "add v5.4s, v5.4s, v25.4s\n"
                         "st1 { v4.4s,  v5.4s}, [x26]\n"
                         "add x26, x26, %[offset]\n"

                         // Row 7
                         "ld1 {v24.4s, v25.4s}, [x26]\n"
                         "add v6.4s, v6.4s, v24.4s\n"
                         "add v7.4s, v7.4s, v25.4s\n"
                         "st1 { v6.4s,  v7.4s}, [x26]\n"
                         "add x26, x26, %[offset]\n"

                         // Row 8
                         "ld1 {v24.4s, v25.4s}, [x26]\n"
                         "add v8.4s, v8.4s, v24.4s\n"
                         "add v9.4s, v9.4s, v25.4s\n"
                         "st1 { v8.4s,  v9.4s}, [x26]\n"
                         "add x26, x26, %[offset]\n"

                         // Row 9
                         "ld1 {v24.4s, v25.4s}, [x26]\n"
                         "add v10.4s, v10.4s, v24.4s\n"
                         "add v11.4s, v11.4s, v25.4s\n"
                         "st1 {v10.4s, v11.4s}, [x26]\n"
                         "add x26, x26, %[offset]\n"

                         // Row 10
                         "ld1 {v24.4s, v25.4s}, [x26]\n"
                         "add v12.4s, v12.4s, v24.4s\n"
                         "add v13.4s, v13.4s, v25.4s\n"
                         "st1 {v12.4s, v13.4s}, [x26]\n"
                         "add x26, x26, %[offset]\n"

                         // Row 11 (last row, so x26 is not advanced afterwards)
                         "ld1 {v24.4s, v25.4s}, [x26]\n"
                         "add v14.4s, v14.4s, v24.4s\n"
                         "add v15.4s, v15.4s, v25.4s\n"
                         "st1 {v14.4s, v15.4s}, [x26]\n"
                         // NOTE(review): A, B and C are declared read-write ("+r") although the
                         // asm only reads them (it works on copies in x20/x21/x26).
                         : [A] "+r"(A), [B] "+r"(B), [C] "+r"(C)
                         : [K] "r"(K4), [offset] "r"(offset)
                         // "memory" covers the reads of A/B and the read-modify-write of C.
                         // NOTE(review): x19 is listed as clobbered but is not referenced
                         // anywhere in the asm text above.
                         : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8",
                         "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
                         "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
                         "v30", "v31", "x19", "x20", "x21", "x22", "x26");
}
#endif
